packs<-c("tidyverse","caret","rmarkdown","rgeos",
"rnaturalearth","rnaturalearthdata","sf",
"ggspatial","maps","plotly")
sapply(packs,require,character=T)
## tidyverse caret rmarkdown rgeos
## TRUE TRUE TRUE TRUE
## rnaturalearth rnaturalearthdata sf ggspatial
## TRUE TRUE TRUE TRUE
## maps plotly
## TRUE TRUE
df<-read.csv("covid-19-all.csv")
us<-df[df$Country.Region == "US" & df$Latitude > 24 & df$Latitude < 54 &
df$Longitude < -62 & df$Longitude > -126,]
####Free memory by deleting the unused dataframe
rm(df)
####Create a basic plot of the counties in the US
plot(us$Longitude,us$Latitude)
## Analyze data attributes
####Create a new vector for the ranges of confirmed cases data
casesRange<-list(c(seq(1,49,1)),c(seq(50,99,1)),c(seq(100,199,1)),c(seq(200,499,1)),
c(seq(500,999,1)),c(seq(1000,1999,1)),c(seq(2000,9999,1)),c(0))
us$ConfirmedRange<-NA
us$ConfirmedRange[us$Confirmed %in% casesRange[[8]]]<-"0"
us$ConfirmedRange[us$Confirmed %in% casesRange[[1]]]<-"1-49"
us$ConfirmedRange[us$Confirmed %in% casesRange[[2]]]<-"50-99"
us$ConfirmedRange[us$Confirmed %in% casesRange[[3]]]<-"100-199"
us$ConfirmedRange[us$Confirmed %in% casesRange[[4]]]<-"200-499"
us$ConfirmedRange[us$Confirmed %in% casesRange[[5]]]<-"500-999"
us$ConfirmedRange[us$Confirmed %in% casesRange[[6]]]<-"1,000-1,999"
us$ConfirmedRange[us$Confirmed %in% casesRange[[7]]]<-"2,000-9,999"
us$ConfirmedRange[us$Confirmed >= 10000]<-"10,000+"
us$ConfirmedRange<-as.factor(us$ConfirmedRange)
levels(us$ConfirmedRange)
## [1] "0" "1-49" "1,000-1,999" "10,000+" "100-199"
## [6] "2,000-9,999" "200-499" "50-99" "500-999"
####Reorder the factor levels from least to greatest
us$ConfirmedRange<-factor(us$ConfirmedRange,levels = levels(us$ConfirmedRange)[c(1,2,8,5,7,9,3,6,4)])
####Plot the confirmed cases by their American Latitude/Longitude
colors<-c(rgb(127,255,0,maxColorValue = 255),rgb(34,139,34,maxColorValue = 255),rgb(0,100,0,maxColorValue = 255),
rgb(255,255,0,maxColorValue = 255),rgb(255,165,0,maxColorValue = 255),rgb(255,0,0,maxColorValue = 255),
rgb(139,0,0,maxColorValue = 255),rgb(128,0,128,maxColorValue = 255),rgb(0,0,0,maxColorValue = 255))
world <- ne_countries(scale = "medium", returnclass = "sf")
states <- st_as_sf(map("state", plot = FALSE, fill = TRUE))
states <- cbind(states, st_coordinates(st_centroid(states)))
## Warning in st_centroid.sf(states): st_centroid assumes attributes are constant
## over geometries of x
## Warning in st_centroid.sfc(st_geometry(x), of_largest_polygon =
## of_largest_polygon): st_centroid does not give correct centroids for longitude/
## latitude data
ggplot(data = world)+
geom_sf()+
geom_sf(data = states, fill = NA)+
coord_sf(xlim = c(-126, -70), ylim = c(24, 50), expand = FALSE)+
geom_point(data = us, shape = 15, aes(x = Longitude, y = Latitude, group = Province.State,
color = ConfirmedRange), size = 1)+
scale_color_manual(values = colors)+
labs(x = "Latitude", y = "Longitude", title = "Confirmed Cases by State")+
theme_void()
## Next, we can look at the number of deaths per county in each state
####We can reuse the code from before changing it to count deaths instead in a new column
us$Deaths[is.na(us$Deaths)]<-0
us$DeathsRange<-NA
us$DeathsRange[us$Deaths %in% casesRange[[8]]]<-"0"
us$DeathsRange[us$Deaths %in% casesRange[[1]]]<-"1-49"
us$DeathsRange[us$Deaths %in% casesRange[[2]]]<-"50-99"
us$DeathsRange[us$Deaths %in% casesRange[[3]]]<-"100-199"
us$DeathsRange[us$Deaths %in% casesRange[[4]]]<-"200-499"
us$DeathsRange[us$Deaths %in% casesRange[[5]]]<-"500-999"
us$DeathsRange[us$Deaths %in% casesRange[[6]]]<-"1,000-1,999"
us$DeathsRange[us$Deaths %in% casesRange[[7]]]<-"2,000-9,999"
us$DeathsRange[us$Deaths >= 10000]<-"10,000+"
us$DeathsRange<-as.factor(us$DeathsRange)
us$DeathsRange<-factor(us$DeathsRange,levels = levels(us$DeathsRange)[c(1,2,8,5,7,9,3,6,4)])
####We can use largely the same plot code from before to plot the deaths
ggplot(data = world)+
geom_sf()+
geom_sf(data = states, fill = NA)+
coord_sf(xlim = c(-126, -70), ylim = c(24, 50), expand = FALSE)+
geom_point(data = us, shape = 15, aes(x = Longitude, y = Latitude, group = Province.State,
color = DeathsRange), size = 1)+
scale_color_manual(values = colors)+
labs(x = "Latitude", y = "Longitude", title = "Deaths by State")+
theme_void()
## Going beyond static images, We can also use plotly to generate an interactive plot to show multiple points at once. ## First, we need to create unique IDs for cities that have reported multiple times.
us$Loc<-paste(us$Latitude,us$Longitude)
us$Loc<-as.factor(us$Loc)
####Then we take only the highest number for cumulative confirmed, deaths, and recovered.
us$Date<-as.Date(us$Date)
usg<- us %>%
arrange(desc(Date)) %>%
group_by(Loc) %>%
top_n(1,Date) ####Select only the latest date for each location
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showland = TRUE,
landcolor = toRGB("gray70"),
subunitcolor = toRGB("gray85"),
countrycolor = toRGB("gray85"),
countrywidth = 0.5,
subunitwidth = 0.5
)
fig <- plot_geo(usg, lat = ~Latitude, lon = ~Longitude, colors = colors)
fig <- fig %>% add_markers(
text = ~paste(Province.State,paste("Confirmed",Confirmed),paste("Deaths",Deaths),paste("Recovered",Recovered), sep = "<br />"),
color = ~ConfirmedRange,
symbol = I("square"),
size = I(8),
hoverinfo = "text",
)
fig <- fig %>% layout(
title = 'Coronavirus in the United States<br />(Hover for totals)', geo = g
)
##This interactive plotly figure displays the total confirmed cases, deaths, and recovered in the United States.
##The plot can be dragged and zoomed and data can be hidden by severity group.
fig